library(Seurat)
library(dplyr)
library(gridExtra)
setwd("~/Desktop/PD_scRNAseq/")
dir.create(file.path("Data"), showWarnings = FALSE)
load("Data/seurat_object_add_HTO_ids.Rdata")
pbmc <- seurat.obj
str(pbmc@raw.data)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:19051135] 3 6 8 9 10 12 13 14 16 18 ...
## ..@ p : int [1:27864] 0 10681 11185 13802 16136 18413 20563 22849 25028 27160 ...
## ..@ Dim : int [1:2] 24914 27863
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:24914] "A1BG" "A1BG-AS1" "A2M" "A2M-AS1" ...
## .. ..$ : chr [1:27863] "AAGCAGTGGTATCAAC" "TCTGTCTTCCAGTTTT" "GTCCTCATCCGCATCT" "CGGAGTCAGTGTTAGA" ...
## ..@ x : num [1:19051135] 1 1 1 1 1 1 1 1 1 1 ...
## ..@ factors : list()
library(readxl)
## Update dx and mut fields
### Import updated dx, mut info
subjectInfo <- read_excel("Data/scRNAseq_meta.xlsx")
### Import metadata
meta <- read.table("Data/meta.data3.tsv", header=T )
### Remove last 2 cols (contain incorrect values) and duplicate HTO col
sum(meta$HTO != meta$HTO.1) # double check they're identical
meta["barcode"] = row.names(meta)
unique(meta["HTO"])
# Map incorrect subject IDs to CORRECT subject IDs (typos occurred at NYGC during processing)
map = setNames( # Incorrect (from NYGC)
c("NYUMD0011", "BIMD0076", "MSMD0067", "BIMD0077", "BIMD0007",
"BIMD0075", "NYUMD0015", "MSMD0035","MSMD0207", "BIMD0010"),
# Correct (from Evan)
c("NYUMD0011", "BID0076", "MSMD0067", "BID0077", "BIMD0007",
"BID00075","NYUMD0015", "MSMD0035", "MSMD0207", "BIMD0010")
)
meta["ID"] <- map[unlist(meta["HTO"])]
meta <- subset(meta, select = -c(dx, mut, HTO, HTO.1))
### Merge new cols
metadata <- merge(meta, subjectInfo, by="ID", all.x=T)
row.names(metadata) <- metadata$barcode
colnames(metadata)[colnames(metadata)=="Ethnitcity"] <- "Ethnicity"
dim(meta)
dim(metadata)
head(metadata)
# Correct misspelling
### Export updated Metadata
write.table(metadata, "Data/meta.data4.tsv")
# Add metadata to seurat object
pbmc <- AddMetaData(object = pbmc, metadata = metadata )
pbmc
## % Mitochondrial genes metadata
#mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
#percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)
#pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
metadata <- read.table("Data/meta.data4.tsv")
head(metadata)
## ID nGene nUMI orig.ident singlet.or.not.binary
## AAAGCAAGTTTGTTGG BIMD0007 784 1780 RAJ_13357 1
## TCAGCAATCTTGACGA BIMD0007 742 1854 RAJ_13357 1
## AGCTCCTTCGCGTAGC BIMD0007 495 988 RAJ_13357 1
## TATTACCCACTCTGTC BIMD0007 812 1874 RAJ_13357 1
## CTCGAGGAGCGATTCT BIMD0007 863 2260 RAJ_13357 1
## ATAAGAGCATCAGTCA BIMD0007 803 2034 RAJ_13357 1
## percent.mito res.2 res.1 res.0.6 res.0.3 CellType
## AAAGCAAGTTTGTTGG 0.03146067 3 3 1 0 0
## TCAGCAATCTTGACGA 0.03022126 18 1 0 0 0
## AGCTCCTTCGCGTAGC 0.04048583 14 7 2 3 3
## TATTACCCACTCTGTC 0.04695838 16 12 7 6 6
## CTCGAGGAGCGATTCT 0.02125775 1 0 1 0 0
## ATAAGAGCATCAGTCA 0.02163225 9 8 0 0 0
## barcode dx mut Ethnicity Gender Age
## AAAGCAAGTTTGTTGG AAAGCAAGTTTGTTGG PD PD White M 59
## TCAGCAATCTTGACGA TCAGCAATCTTGACGA PD PD White M 59
## AGCTCCTTCGCGTAGC AGCTCCTTCGCGTAGC PD PD White M 59
## TATTACCCACTCTGTC TATTACCCACTCTGTC PD PD White M 59
## CTCGAGGAGCGATTCT CTCGAGGAGCGATTCT PD PD White M 59
## ATAAGAGCATCAGTCA ATAAGAGCATCAGTCA PD PD White M 59
# Make AgeGroups
makeAgeGroups <- function(){
dim(metadata)
getMaxRound <- function(vals=metadata$Age, unit=10)unit*ceiling((max(vals)/unit))
getMinRound <- function(vals=metadata$Age, unit=10)unit*floor((min(vals)/unit))
ageBreaks = c(seq(getMinRound(), getMaxRound(), by = 10), getMaxRound()+10)
AgeGroupsUniq <- c()
for (i in 1:(length(ageBreaks)-1)){
AgeGroupsUniq <- append(AgeGroupsUniq, paste(ageBreaks[i],ageBreaks[i+1], sep="-"))
}
data.table::setDT(metadata,keep.rownames = T,check.names = F)[, AgeGroups := cut(Age,
breaks = ageBreaks,
right = F,
labels = AgeGroupsUniq,
nclude.lowest=T)]
metadata <- data.frame(metadata)
unique(metadata$AgeGroups)
head(metadata)
dim(metadata)
return(metadata)
}
# metadata <- makeAgeGroups()
pbmc <- AddMetaData(object = pbmc, metadata = metadata)
# Get rid of any NAs (cells that don't match up with the metadata)
pbmc <- FilterCells(object = pbmc, subset.names = "nGene", low.thresholds = 0,
# Subset for testing
cells.use = pbmc@cell.names[0:2000]
)
head(pbmc@meta.data)
## nGene nUMI orig.ident singlet.or.not.binary HTO
## GTCATTTTCCGCATCT 2035 6729 RAJ_13357 1 NYUMD0011
## ATAAGAGAGGAGTTGC 1914 6718 RAJ_13357 1 BID0076
## CATTATCCATGGTCTA 1971 6156 RAJ_13357 1 MSMD0067
## CTTAGGAGTGGCGAAT 1893 6392 RAJ_13357 1 NYUMD0011
## CATCAGAAGCACCGCT 1967 6110 RAJ_13357 1 MSMD0067
## GCGGGTTAGTAGCCGA 1868 5977 RAJ_13357 1 NYUMD0011
## ID percent.mito res.2 res.1 res.0.6 res.0.3
## GTCATTTTCCGCATCT MSMD0207 0.04191439 0 1 0 0
## ATAAGAGAGGAGTTGC BIMD0076 0.04362066 0 1 0 0
## CATTATCCATGGTCTA NYUMD0015 0.03086921 0 1 0 0
## CTTAGGAGTGGCGAAT MSMD0207 0.03613892 0 1 0 0
## CATCAGAAGCACCGCT NYUMD0015 0.04437531 19 10 8 0
## GCGGGTTAGTAGCCGA MSMD0207 0.03514056 6 2 0 0
## CellType barcode dx mut Ethnicity
## GTCATTTTCCGCATCT 0 GTCATTTTCCGCATCT PD PD White
## ATAAGAGAGGAGTTGC 0 ATAAGAGAGGAGTTGC PD GBA White
## CATTATCCATGGTCTA 0 CATTATCCATGGTCTA control control White
## CTTAGGAGTGGCGAAT 0 CTTAGGAGTGGCGAAT PD PD White
## CATCAGAAGCACCGCT 0 CATCAGAAGCACCGCT control control White
## GCGGGTTAGTAGCCGA 0 GCGGGTTAGTAGCCGA PD PD White
## Gender Age
## GTCATTTTCCGCATCT M 71
## ATAAGAGAGGAGTTGC M 47
## CATTATCCATGGTCTA M 51
## CTTAGGAGTGGCGAAT M 71
## CATCAGAAGCACCGCT M 51
## GCGGGTTAGTAGCCGA M 71
VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)
par(mfrow = c(1, 2))
GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "percent.mito")
GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "nGene")
pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"),
low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
scale.factor = 10000)
# Store the top most variable genes in @var.genes
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
##
## Time Elapsed: 14.3587610721588 secs
ProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above
# Run PCA with only the top most variables genes
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5,
genes.print = 5)
## [1] "PC1"
## [1] "S100A12" "VCAN" "MS4A6A" "CD14" "RNU1-60P"
## [1] ""
## [1] "FCGR3A" "IFITM3" "CDKN1C" "IFITM2" "RHOC"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "FCER1A" "HLA-DQB1" "CLEC10A" "HLA-DPB1" "HLA-DQA1"
## [1] ""
## [1] "S100A12" "RNU1-60P" "RNVU1-13" "CFD" "S100A11"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "RNVU1-13" "HLA-DPB1" "RNVU1-19" "HLA-DPA1" "RNU1-60P"
## [1] ""
## [1] "S100A10" "S100A12" "SNHG5" "hsa-mir-6723"
## [5] "VCAN"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "hsa-mir-6723" "HES1" "TIPARP" "FOLR3"
## [5] "SNHG5"
## [1] ""
## [1] "RP4-765C7.2" "CTB-118N6.1" "RP11-343H5.4" "HLA-DRB5"
## [5] "MNDA"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "C1orf56" "HNRNPH1" "FGD2" "CDC42SE1" "CTNNB1"
## [1] ""
## [1] "RP4-765C7.2" "CTB-118N6.1" "RP11-343H5.4" "ANAPC7"
## [5] "SNORD3B-1"
## [1] ""
## [1] ""
# PCA plots
VizPCA(object = pbmc, pcs.use = 1:2)
PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)
pbmc <- ProjectPCA(object = pbmc, do.print = FALSE)
## PCA Heatmap: PC1
PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced = TRUE, label.columns = FALSE)
## PCA Heatmap: PC1-PCn
PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE,
label.columns = FALSE, use.full = FALSE)
NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time
#pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = pbmc)
We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function.
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = 0.6, print.output = 0, save.SNN = TRUE)
PrintFindClustersParams(object = pbmc)
## Parameters used in latest FindClusters calculation run on: 2018-12-20 16:49:37
## =============================================================================
## Resolution: 0.6
## -----------------------------------------------------------------------------
## Modularity Function Algorithm n.start n.iter
## 1 1 100 10
## -----------------------------------------------------------------------------
## Reduction used k.param prune.SNN
## pca 30 0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.
labSize <- 6
pbmc <- StashIdent(object = pbmc, save.name = "pre_clustering")
#pbmc <- SetAllIdent(object = pbmc, id = "pre_clustering")
try({
pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = pbmc, do.label=T, label.size = labSize)
})
metaVars <- c("CellType","dx","mut","Gender","Age")
for (var in metaVars){
print(paste("t-SNE Metadata plot for ",var))
# Metadata plot
p1 <- TSNEPlot(pbmc, do.return = T, pt.size = 0.5, group.by = var, do.label = T,
dark.theme=F, plot.title=paste("Color by ",var))
# t-SNE clusters plot
p2 <- TSNEPlot(pbmc, do.label = T, do.return = T, pt.size = 0.5, plot.title=paste("Color by t-SNE clusters"))
print(plot_grid(p1, p2))
}
## [1] "t-SNE Metadata plot for CellType"
## [1] "t-SNE Metadata plot for dx"
## [1] "t-SNE Metadata plot for mut"
## [1] "t-SNE Metadata plot for Gender"
## [1] "t-SNE Metadata plot for Age"
### Biomarkers: One Cluster vs. Specific Clusters
# cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = c(2),
# min.pct = 0.25)
# print(x = head(x = cluster5.markers, n = 3))
### Biomarkers: One Cluster vs. All Other Clusters
# find all markers of a given cluster
# MUST run FindClusters() first
# cluster0.markers <- FindMarkers(object = pbmc, ident.1 = 0, min.pct = 0.25)
# print(x = head(x = cluster0.markers, n = 3))
# find markers for every cluster compared to all remaining cells, report
# only the positive ones
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
topBiomarkers <- pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
topBiomarkers
## # A tibble: 10 x 7
## # Groups: cluster [5]
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 3.39e-192 1.05 1 1 8.45e-188 0 S100A8
## 2 3.79e-152 1.15 0.946 0.605 9.45e-148 0 S100A12
## 3 4.43e-206 1.73 0.884 0.16 1.10e-201 1 FCGR3A
## 4 1.09e- 58 1.25 0.374 0.074 2.72e- 54 1 C1QA
## 5 3.52e- 46 0.514 0.994 0.896 8.77e- 42 2 HLA-DPB1
## 6 1.25e- 14 0.776 0.261 0.111 3.11e- 10 2 HES1
## 7 7.00e-208 2.03 0.716 0.044 1.74e-203 3 FCER1A
## 8 2.69e-104 1.37 0.616 0.103 6.70e-100 3 CLEC10A
## 9 1.46e-138 1.87 0.914 0.11 3.64e-134 4 RP4-765C7.2
## 10 1.25e-103 1.13 0.625 0.057 3.11e- 99 4 CTB-118N6.1
Seurat has several tests for differential expression which can be set with the test.use parameter (see the DE vignette for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).
getTopBiomarker <- function(pbmc.markers, clusterID, topN=1){
df <- subset(pbmc.markers, p_val_adj<0.05 & cluster==as.character(clusterID)) %>% arrange(desc(avg_logFC))
top_pct_markers <- df[1:topN,"gene"]
return(top_pct_markers)
}
clust1_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=1, topN=2)
clust2_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=2, topN=2)
# Top Biomarkers for
plotBiomarkers <- function(pbmc, biomarkers, cluster){
biomarkerPlots <- list()
for (marker in biomarkers){
print(marker)
p <- VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T)
biomarkerPlots[[marker]] <- p + ggplot2::aes(alpha=.7)
}
combinedPlot <- do.call(grid.arrange, c(biomarkerPlots,
list(ncol=2, top=paste("Top DEG Biomarkers for Cluster",cluster))) )
return(combinedPlot)
}
# Plot top 2 biomarker genes for each
for (clust in unique(pbmc.markers$cluster)){
biomarkers <- getTopBiomarker(pbmc.markers, clusterID=clust, topN=2)
plotBiomarkers(pbmc, biomarkers, clust)
}
## [1] "S100A12"
## [1] "S100A8"
## [1] "FCGR3A"
## [1] "C1QA"
## [1] "HES1"
## [1] "HLA-DPB1"
## [1] "FCER1A"
## [1] "CLEC10A"
## [1] "RP4-765C7.2"
## [1] "CTB-118N6.1"
FeaturePlot(object = pbmc, features.plot = topBiomarkers$gene, cols.use = c("grey", "blue"),
reduction.use = "tsne")
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
# setting slim.col.label to TRUE will print just the cluster IDS instead of
# every cell name
DoHeatmap(object = pbmc, genes.use = top10$gene, slim.col.label = TRUE, remove.key = TRUE)
current.cluster.ids <- unique(pbmc.markers$cluster) #c(0, 1, 2, 3, 4, 5, 6, 7)
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
new.cluster.ids <- top1$gene #c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object = pbmc, do.label = TRUE, pt.size = 0.5)
If you perturb some of our parameter choices above (for example, setting resolution=0.8 or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite object@ident), we can stash our renamed identities to be easily recovered later.
# First lets stash our identities for later
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
# Note that if you set save.snn=T above, you don't need to recalculate the
# SNN, and can simply put: pbmc <- FindClusters(pbmc,resolution = 0.8)
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = 0.8, print.output = FALSE)
## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
# Demonstration of how to plot two tSNE plots side by side, and how to color
# points based on different criteria
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6",
no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot_grid(plot1, plot2)
# Find discriminating markers
tcell.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = 1)
# Most of the markers tend to be expressed in C1 (i.e. S100A4). However, we
# can see that CCR7 is upregulated in C0, strongly indicating that we can
# differentiate memory from naive CD4 cells. cols.use demarcates the color
# palette from low to high expression
FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("green", "blue"))
pbmc <- SetAllIdent(object = pbmc, id = "ClusterNames_0.6")
saveRDS(pbmc, file = "Data/cd14-processed.rds")